This code covers chapter 3 of “Introduction to Data Mining” by Pang-Ning Tan, Michael Steinbach and Vipin Kumar.
This work is licensed under the Creative Commons Attribution 4.0 International License. For questions please contact Michael Hahsler.
Load the iris data set
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
Get summary statistics
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
Get mean and standard deviation for sepal length
mean(iris$Sepal.Length)
## [1] 5.843333
sd(iris$Sepal.Length)
## [1] 0.8280661
Apply mean, sd and median to columns (MARGIN=2)
apply(iris[1:4], MARGIN=2, mean)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.843333 3.057333 3.758000 1.199333
apply(iris[1:4], MARGIN=2, median)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.80 3.00 4.35 1.30
apply(iris[1:4], MARGIN=2, sd)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0.8280661 0.4358663 1.7652982 0.7622377
apply(iris[1:4], MARGIN=2, var)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0.6856935 0.1899794 3.1162779 0.5810063
apply(iris[1:4], MARGIN=2, min)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 4.3 2.0 1.0 0.1
apply(iris[1:4], MARGIN=2, max)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 7.9 4.4 6.9 2.5
Discretize the data first since there are too many values (cut divides the range by breaks, see package discretization for other methods)
# Discretize the four numeric iris features into ordered factors with
# three equal-width bins each ("small" < "medium" < "large").
# cut() splits the range of each column into `breaks` equal intervals.
# NOTE: the argument is spelled out as `ordered_result` -- the previous
# `ordered = TRUE` only worked through partial argument matching.
bin_labels <- c("small", "medium", "large")
iris_discrete <- data.frame(
  lapply(iris[1:4], cut, breaks = 3, labels = bin_labels,
    ordered_result = TRUE),
  Species = iris$Species
)
head(iris_discrete)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 small medium small small setosa
## 2 small medium small small setosa
## 3 small medium small small setosa
## 4 small medium small small setosa
## 5 small medium small small setosa
## 6 small large small small setosa
summary(iris_discrete)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## small :59 small :47 small :50 small :50 setosa :50
## medium:71 medium:88 medium:54 medium:54 versicolor:50
## large :20 large :15 large :46 large :46 virginica :50
Create some tables
table(iris_discrete$Sepal.Length, iris_discrete$Sepal.Width)
##
## small medium large
## small 12 37 10
## medium 31 37 3
## large 4 14 2
table(iris_discrete$Petal.Length, iris_discrete$Petal.Width)
##
## small medium large
## small 50 0 0
## medium 0 48 6
## large 0 6 40
table(iris_discrete$Petal.Length, iris_discrete$Species)
##
## setosa versicolor virginica
## small 50 0 0
## medium 0 48 6
## large 0 2 44
#table(iris_discrete)
Test if the two features are independent given the counts in the contingency table (H0: independence)
p-value: the probability of seeing a more extreme value of the test statistic under the assumption that H0 is correct. Low p-values (typically less than .05 or .01) indicate that H0 should be rejected.
tbl <- table(iris_discrete$Sepal.Length, iris_discrete$Sepal.Width)
tbl
##
## small medium large
## small 12 37 10
## medium 31 37 3
## large 4 14 2
chisq.test(tbl)
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 12.879, df = 4, p-value = 0.01188
Fisher’s exact test is better for small counts (cells with counts <5)
fisher.test(tbl)
##
## Fisher's Exact Test for Count Data
##
## data: tbl
## p-value = 0.01063
## alternative hypothesis: two.sided
Plot the distribution for a discrete variable
table(iris_discrete$Sepal.Length)
##
## small medium large
## 59 71 20
barplot(table(iris_discrete$Sepal.Length))
apply(iris[1:4], MARGIN=2, quantile)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0% 4.3 2.0 1.00 0.1
## 25% 5.1 2.8 1.60 0.3
## 50% 5.8 3.0 4.35 1.3
## 75% 6.4 3.3 5.10 1.8
## 100% 7.9 4.4 6.90 2.5
Interquartile range
quantile(iris$Petal.Length)
## 0% 25% 50% 75% 100%
## 1.00 1.60 4.35 5.10 6.90
quantile(iris$Petal.Length)[4] - quantile(iris$Petal.Length)[2]
## 75%
## 3.5
Define your own statistic, e.g., MAD (median absolute deviation; the version below measures absolute deviations from the mean)
mad <- function(x) median(abs(x-mean(x)))
apply(iris[1:4], MARGIN=2, mad)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0.6566667 0.2573333 1.7920000 0.7993333
Show the distribution of a single numeric variable
hist(iris$Petal.Width)
hist(iris$Petal.Width, breaks=20, col="grey")
Show the relationship between two numeric variables
plot(x=iris$Petal.Length, y=iris$Petal.Width, col=iris$Species)
Show the relationship between several numeric variables
pairs(iris, col=iris$Species)
Alternative scatter plot matrix
library("GGally")
ggpairs(iris, color = "Species")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Compare the distribution of continuous variables grouped by a nominal variable
boxplot(iris[,1:4])
Group-wise averages
aggregate(Sepal.Length ~ Species, data=iris, FUN = mean)
## Species Sepal.Length
## 1 setosa 5.006
## 2 versicolor 5.936
## 3 virginica 6.588
aggregate(Sepal.Width ~ Species, data=iris, FUN = mean)
## Species Sepal.Width
## 1 setosa 3.428
## 2 versicolor 2.770
## 3 virginica 2.974
e <- ecdf(iris$Petal.Width)
hist(iris$Petal.Width, breaks=20, freq=FALSE, col="gray")
lines(e, col="red", lwd=2)
iris_matrix <- as.matrix(iris[,1:4])
image(iris_matrix)
library(seriation) ## for pimage
pimage(iris_matrix, ylab="Object (ordered by species)",
main="Original values", colorkey=TRUE)
In the scaled image, values smaller than the feature mean are shown in blue and larger ones in red.
iris_scaled <- scale(iris_matrix)
pimage(iris_scaled,
ylab="Object (ordered by species)",
main="Standard deviations from the feature mean")
Use seriation to reorder features and objects so that similar ones are placed next to each other.
pimage(iris_scaled, order = seriate(iris_scaled),
main="Standard deviations (reordered)")
Calculate and visualize the correlation between features
cm1 <- cor(iris_matrix)
cm1
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
## Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
## Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
## Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
library(seriation) ## for pimage and hmap
pimage(cm1)
hmap(cm1, margin = c(7,7), cexRow = 1, cexCol = 1)
library(corrplot)
corrplot(cm1, method="ellipse")
corrplot(cm1, method=c("ellipse"), order="FPC")
Test if correlation is significantly different from 0
cor.test(iris$Sepal.Length, iris$Sepal.Width)
##
## Pearson's product-moment correlation
##
## data: iris$Sepal.Length and iris$Sepal.Width
## t = -1.4403, df = 148, p-value = 0.1519
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.27269325 0.04351158
## sample estimates:
## cor
## -0.1175698
cor.test(iris$Petal.Length, iris$Petal.Width) #this one is significant
##
## Pearson's product-moment correlation
##
## data: iris$Petal.Length and iris$Petal.Width
## t = 43.387, df = 148, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9490525 0.9729853
## sample estimates:
## cor
## 0.9628654
Correlation between objects
cm2 <- cor(t(iris_matrix))
pimage(cm2,
main="Correlation matrix", xlab="Objects", ylab="Objects",
zlim = c(-1,1),col = bluered(100))
library(MASS)
parcoord(iris[,1:4], col=iris$Species)
Reorder with placing correlated features next to each other
library(seriation)
o <- seriate(as.dist(1-cor(iris[,1:4])), method="BBURCG")
get_order(o)
## [1] 3 4 1 2
parcoord(iris[,get_order(o)], col=iris$Species)
Create some fake crime data. The example is adapted from ? ggmap. Each observation has a GPS coordinate (lon/lat) and a crime type. The data is centered at SMU. You can use the URL in Google Maps to find GPS coordinates.
# Simulate crime observations around the SMU campus. For each crime type,
# draw a Poisson-distributed number of points (mean 100) from a bivariate
# normal centered near `loc` with a random covariance matrix.
loc <- c(lon = -96.7860835, lat = 32.8422109)
crimes <- c("burglary", "car theft", "intoxication")

# Build one block of points per crime type, then combine once at the end
# (avoids growing a matrix with rbind() inside the loop).
obs_list <- vector("list", length(crimes))
for (k in seq_along(crimes)) {  # seq_along() is safe even for empty vectors
  a <- rnorm(2)
  b <- rnorm(2)
  # outer(a, a) + outer(b, b) is symmetric positive semi-definite; the
  # 1/50000 factor keeps the spread within a few blocks of `loc`.
  si <- 1 / 50000 * (outer(a, a) + outer(b, b))
  obs_list[[k]] <- cbind(
    MASS::mvrnorm(rpois(1, 100), jitter(loc, .0005), si), k
  )
}
observations <- do.call(rbind, obs_list)

# Shuffle the rows, name the columns, and turn the numeric crime code
# into a factor labeled with the crime names.
observations <- data.frame(observations)[sample(seq_len(nrow(observations))), ]
names(observations) <- c('lon', 'lat', 'crime')
observations$crime <- factor(observations$crime, labels = crimes)
head(observations)
## lon lat crime
## 14 -96.79307 32.84458 burglary
## 277 -96.78621 32.84735 intoxication
## 64 -96.79049 32.84124 burglary
## 315 -96.77526 32.86041 intoxication
## 173 -96.78370 32.83095 car theft
## 294 -96.77558 32.85697 intoxication
plot(observations[,c("lon", "lat")], col = observations[, "crime"])
Use package ggmap to display the data on a (Google) map.
library("ggmap")
## Loading required package: ggplot2
get map (centered around SMU)
SMU_map <- get_map(location = loc, zoom = 14)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=32.842211,-96.786084&zoom=14&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
plot (see http://ggplot2.org/ to learn about ggplot-style plotting using layers)
Note: Points that fall outside the map produce a warning.
ggmap(SMU_map) +
geom_point(aes(x = lon, y = lat, color = crime),
data = observations, alpha = .5) +
scale_color_discrete(labels = levels(observations[,"crime"])) +
ggtitle("Reported crimes around the SMU Campus")
## Warning: Removed 11 rows containing missing values (geom_point).
plot the density
ggmap(SMU_map) +
stat_density2d(aes(fill = ..level.., alpha = ..level.., x = lon, y = lat),
geom="polygon", data = observations) +
scale_alpha(guide = 'none')+
ggtitle("Crime density around the SMU Campus")
## Warning: Removed 11 rows containing non-finite values (stat_density2d).
Look at some example maps at http://rgraphgallery.blogspot.com/search/label/map